We want to find a relationship between Attained Education, State, Gender, and the quality of life of the US population.
The main assumption is that quality of life is directly related to Salary [USD/hour] and Commute Time to work [hours].
library(dplyr)
library(readr)
library(DT)
library(plotly)
library(ggplot2)
library(RColorBrewer)
library(d3heatmap)
# Switch for the raw-data path. When TRUE, the chunks guarded by `load.data`
# operate on the raw tables `acs14pusa` and `acs14pusb`, which are not created
# in the visible part of this script and must already exist in the session.
load.data <- FALSE

if (load.data) {
  # Interactive preview of the first 50 rows of part A, 10 rows per page,
  # with horizontal scrolling for the wide table.
  datatable(
    head(acs14pusa, 50),
    options = list(scrollX = TRUE, pageLength = 10)
  )
}

if (load.data) {
  # Only the last value inside a braced block is auto-printed, so the first
  # dimension has to be printed explicitly to be reported at all.
  print(dim(acs14pusa))
  dim(acs14pusb)
}
if (load.data) {
  # Variables kept for the analysis; every other column is dropped.
  relevant.columns <- c(
    "SERIALNO", "ST", "SEX", "AGEP", "SCHL", "INDP", "WKHP", "WAGP", "ESR",
    "PINCP", "PERNP", "JWMNP"
  )
  acs14pusa.cols <- acs14pusa[, names(acs14pusa) %in% relevant.columns]
  acs14pusb.cols <- acs14pusb[, names(acs14pusb) %in% relevant.columns]
  # The full tables are no longer needed once the reduced copies exist;
  # remove them and trigger a garbage collection.
  rm(list = c("acs14pusa", "acs14pusb"))
  gc()
}

if (load.data) {
  # Stack the two reduced parts into one table and report its dimensions.
  acs14pus <- rbind(acs14pusa.cols, acs14pusb.cols)
  dim(acs14pus)
}
Credits to Arnold Chua Lau (Spring 2016).
if (load.data) {
  # State lookup table; its `abbr` and `name` columns are duplicated under
  # ST-prefixed names so they can be picked up with starts_with("ST").
  ST.anno <- read_csv("./data/statenames.csv") %>%
    mutate(STabbr = abbr, STname = name)

  # Numeric copy of the state code, used as the join key against `code`.
  acs14pus <- acs14pus %>%
    mutate(STnum = as.numeric(ST)) %>%
    left_join(ST.anno, by = c("STnum" = "code"))

  # Spot check: the ST* columns of five random rows.
  acs14pus %>%
    sample_n(5) %>%
    select(starts_with("ST"))
}
# Coerce the analysis columns to their working types and derive WAGEHOUR.
# mutate() evaluates its arguments in order, so WAGEHOUR is computed from the
# already-converted WAGP and WKHP.
# NOTE(review): WAGP / WKHP / 52 presumably means annual wages over usual
# weekly hours over 52 weeks per year -- confirm against the data dictionary.
acs14pus <- mutate(
  acs14pus,
  JWMNP = as.numeric(JWMNP),
  WAGP = as.numeric(WAGP),
  WKHP = as.numeric(WKHP),
  STabbr = as.factor(STabbr),
  SCHL = as.integer(SCHL),
  WAGEHOUR = WAGP / WKHP / 52
)

# Lookup table for industry codes.
industry.categories <- read_csv("./data/industry_codes.csv")
Parsed with column specification:
cols(
code = col_integer(),
industry = col_character()
)
# Lookup table mapping education codes to their labels.
education.categories <- read_csv(file = "./data/education_codes.csv")
Parsed with column specification:
cols(
code = col_integer(),
education = col_character()
)
# Attach the education label for each SCHL code. SCHL is coerced to integer
# first so the key has the same type as the lookup table's `code` column.
acs14pus$SCHL <- as.integer(acs14pus$SCHL)
acs14pus <- left_join(acs14pus, education.categories, by = c("SCHL" = "code"))

if (load.data) {
  # Cache the annotated table. The output location is passed positionally:
  # readr renamed this argument from `path` to `file`, so naming it would tie
  # the call to one side of that change.
  write_csv(acs14pus, "./output/ss14pus_columns.csv")
}
# Per-state means of commute time (JWMNP), wages (WAGP), weekly hours (WKHP)
# and hourly wage (WAGEHOUR). Missing values are ignored and each mean is
# rounded to one decimal place. Columns are named where they are computed
# instead of being renamed by position afterwards.
summary.mean <- acs14pus %>%
  group_by(STabbr) %>%
  summarise(
    JWMNP = round(mean(JWMNP, na.rm = TRUE), 1),
    WAGP = round(mean(WAGP, na.rm = TRUE), 1),
    WKHP = round(mean(WKHP, na.rm = TRUE), 1),
    WAGEHOUR = round(mean(WAGEHOUR, na.rm = TRUE), 1)
  )

# Show every state on a single page.
datatable(
  summary.mean,
  options = list(scrollX = TRUE, pageLength = nrow(summary.mean))
)
# Box plot of commute time (JWMNP) for each education label.
# The table is kept sorted by SCHL code, and the x axis categories are pinned
# to the code order of the education lookup table so the boxes do not depend
# on how the plotting library orders character categories by default.
acs14pus <- acs14pus[order(acs14pus$SCHL), ]
plot_ly(
  x = acs14pus$education,
  y = acs14pus$JWMNP,
  type = "box"
) %>%
  layout(
    title = "Commute time by Education Attainment",
    xaxis = list(
      title = "Education",
      categoryorder = "array",
      categoryarray = with(education.categories, education[order(code)])
    ),
    yaxis = list(title = "Commute time")
    # width = 1000,
    # height = 700
  )